# a real world data set
#
# I went out and did some "research" to find the EPA highway
# mpg and the curb weight (in pounds) of a number of SUVs.
#
# I placed that information into a comma separated file,
# curb_mpg.csv. Within the context of this course, we
# could use the c() function to just get that information into R.
# That is done in the following lines.
highway_mpg <- c(31, 31, 31, 26, 27, 28, 25,
                 20, 21, 33, 19, 27, 31, 17)
curb_weight <- c(3124, 3325, 2996, 4253, 4123, 3665, 4362,
                 5473, 5808, 3490, 4750, 4370, 3300, 5730)
SUVs <- c("Trax", "Equinox sml", "Trailblazer", "Blazer sml",
          "Blazer lrg", "Equinox lrg", "Traverse", "Tahoe",
          "Suburban", "Rav4", "4Runner", "Highlander",
          "C-HR", "Sequoia")
#
# Then, we might look at the relation between curb weight and mpg.
# First, we will create a plot of the two values.
plot( curb_weight, highway_mpg,
      main="Sample of some SUV curb weight vs highway mpg",
      xlim=c(2500,6000), ylim=c(18,34),
      xaxp=c(2500,6000,7), yaxp=c(18,34,8), las=2)
abline( h=seq(18,34,2), v=seq(2500,6000,500),
        col="darkgrey", lty="dotted")
# One thing to note is that the mpg is given in whole numbers.
# That forces a lot of values onto the same horizontal level.
# The graph suggests a linear relationship in which, as the curb
# weight goes up, the mpg goes down. We can look at the strength
# of that relationship by finding the correlation coefficient.
cor( curb_weight, highway_mpg )
# That answer, -0.9300396, is pretty strong. If we look
# at the square of that value, it will tell us the percent of the
# variation shown in the data that is explained by the linear
# model (even though we, at this point, have not found that model).
(-0.9300396)^2
# From this we see that over 86% of the variability is explained
# by the best linear model. So, let us find the coefficients,
# a and b, for the equation of the line, y = a + b*x, that is the
# best fit for our data. Note that means we will have
#     mpg = a + b * curb_weight
#
# We will call our model "ratio".
ratio <- lm( highway_mpg ~ curb_weight )
ratio
# This tells us that the line of best fit looks like
#     mpg = 47.28854 + (-0.00502) * curb_weight
# We can graph that line on our plot, and do it in red, with
# a line width of 2.
abline( ratio, col="red", lwd=2 )
# From this we can predict the mpg for a vehicle that
# has a curb weight of 5000 pounds via
47.28854 + (-0.00502) * 5000
# Knowing that the predicted value is 22.18854, we could even
# plot that point, (5000, 22.18854), on our graph via
points( 5000, 22.18854, col="blue", pch=22 )
# Because 5000 is within the range of observed values for
# our independent variable, the x-variable, curb_weight, this
# is an interpolation. We can feel pretty confident of this
# value.
# In the same way, we could predict the mpg for a
# vehicle that weighs 7500 pounds via
47.28854 + (-0.00502) * 7500
# That value, 9.63854, is interesting, but we should not
# have a lot of confidence in it because the independent
# value, 7500, is not within the observed range of our data.
# That makes this an extrapolation, and we should be hesitant
# to suggest that such a value is accurate.
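#
# A quick aside that goes a bit beyond what we need in this
# class: R has a built-in predict() function that will do both
# of those computations for us. It uses the full-precision
# coefficients stored in the model, so its answers may differ
# slightly from our hand computations with the rounded values.
predict( ratio, newdata=data.frame( curb_weight=c(5000, 7500) ) )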
# One other thing that we should check is the distribution
# of the residual values. The long way to find all of those
# residual values is to first find all of the expected values.
ev_s <- 47.28854 + (-0.00502) * curb_weight
ev_s
# We could even add all of those values, which will be on our
# line of best fit, via
points( curb_weight, ev_s, col="green", pch=15 )
# But what we really want is the residual values, that is,
# the observed values minus the expected values.
resid_vals <- highway_mpg - ev_s
resid_vals
# Then we want to see the scatter plot of the
# curb weight and the residuals.
# That will be a new graph.
plot( curb_weight, resid_vals )
# We will be satisfied if the points are all over the
# place, not forming any particular pattern.
# Note that because we saved the linear model in "ratio",
# we could have found the same residual values via
residuals( ratio )
# The results will be slightly different
# because our earlier calculation used the
# rounded values for a and b, the
# intercept and the slope.
# Or, we could have produced our graph via
plot( curb_weight, residuals( ratio ), main="New residual plot" )
# A bit more than is required in this class, we could
# have put the data into a data frame so that we could get a
# slightly better listing of the data.
our_data <- data.frame( SUVs, curb_weight, highway_mpg )
our_data
# or as
View( our_data )   # be sure to note the capital V
# Also beyond the scope of this course, we could have just
# read the data from the file right into a data frame.
new_data <- read.csv("curb_mpg.csv")
new_data
# Then we can reference the separate columns in the data frame
# by using the data frame name followed by a $ followed by the
# column name.
new_data$curb_weight
#####################################
# We have seen an example of real world data. And, for that
# data, we had to enter the values with the c() statements above,
# or read them from the file with read.csv().
# At times I have students ask why I use gnrnd4 to just
# generate random data rather than using real world data.
# The answer is threefold. First, via gnrnd4 I can get
# newly generated data for each example (that includes
# test questions). Second, just running gnrnd4 puts the
# data into variables, L1 and L2, in our RStudio session.
# Third, via gnrnd4, or gnrnd5, I can mimic just about
# any real world data. Look at the following:
source("../gnrnd5.R")
gnrnd5( 23405001406, 4399456001200, 2900002950 )
plot( L1, L2,
      xlim=c(2500,6000), ylim=c(18,34),
      xaxp=c(2500,6000,7), yaxp=c(18,34,8), las=2)
abline( h=seq(18,34,2), v=seq(2500,6000,500),
        col="darkgrey", lty="dotted")
cor( L1, L2 )
new_ratio <- lm( L2 ~ L1 )
new_ratio
abline( new_ratio, col="red" )
# That is hardly any different from the real world data above.
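#
# Another aside, again more than this class requires: calling
# summary() on a saved model reports the intercept and slope
# along with the R-squared value, which makes it easy to compare
# the generated data to the real data in one step.
summary( new_ratio )
summary( ratio )
# And if we want just the R-squared value for our real data, the
# following should match the squared correlation we found earlier.
summary( ratio )$r.squared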
##################################
# Having said that, let us take a look at two more cases.
# First generate the data.
source("../gnrnd4.R")
gnrnd4( 1770222306, 4840717035, 35700076 )
# Look at our values
L1
L2
# or, using the little extra that we saw above,
data.frame( L1, L2 )
# Then, get a first, ugly plot of the data.
plot( L1, L2 )
# Now, to be fancy, improve the plot.
plot( L1, L2, main="our new example",
      xlim=c(-10,50), ylim=c(-10,100), las=1,
      xaxp=c(-10,50,12), yaxp=c(-10,100,22),
      cex.axis=0.7 )
# Add some light horizontal and vertical lines.
abline( h=seq(-10,100,5), v=seq(-10,50,5),
        col="darkgray", lty="dotted")
# Add the axes.
abline( h=0, v=0, col="black" )
# A linear model looks appropriate, so find the correlation
# coefficient.
cor( L1, L2 )
# Just for our reference, square that.
cor( L1, L2 )^2   # this will make poor R redo the computation
# Now we will get the linear model, i.e., the coefficients.
# We will store the model in lm_2.
lm_2 <- lm( L2 ~ L1 )
# Look at the coefficients.
lm_2
# So our model is L2 = -5.840 + 2.138*L1.
# We will graph that line, in purple.
abline( lm_2, col="purple", lwd=2 )
#
# Use the regression equation to predict the value of L2 if
# L1 is 28.3.
-5.840 + 2.138*28.3
# Graph that point.
points( 28.3, 54.6654, col="darkorange", pch=13 )
# Note that this is an interpolation because 28.3 is
# within the observed range of L1.
# Find the residual value when L1 is 38.1.
# We note that 38.1 is the 6th value in L1; the corresponding
# L2 value, the one in position 6, is 82.0. That is the observed
# value. To get the expected value when L1 is 38.1 we compute
-5.840 + 2.138*38.1
# So the residual value will be the observed minus the expected.
82.0 - 75.6178
# Of course, knowing that the ordered pair that we want
# to use is the 6th ordered pair, we could have just done
residuals( lm_2 )
# and found the 6th answer.
# Again, the difference is due to the rounding error for
# our coefficients. We could have found better values
# for the intercept and slope via
coeffs <- coefficients( lm_2 )
coeffs
# These are still rounded, but now to 6 decimal places.
# Therefore the expected value when L1 is 38.1 becomes
-5.839503 + 2.137889 * 38.1
# And the residual is now
82.0 - 75.61407
# Or we could go a bit more crazy and use the full
# 16-digit values of the coefficients to get the
# expected value.
ev_2 <- coeffs[1] + coeffs[2]*38.1
ev_2
# And now the full residual.
82.0 - ev_2
# Finally, check the distribution of the residuals.
plot( L1, residuals( lm_2 ) )
#
# And now a new example.
gnrnd4( 6513562106, 7166322108, 88700176 )
# Look at our values
L1
L2
# or, using the little extra that we saw above,
data.frame( L1, L2 )
# Then, get a first, ugly plot of the data.
plot( L1, L2 )
summary( L1 )
summary( L2 )
# Now, to be fancy, improve the plot.
plot( L1, L2, main="our new example",
      xlim=c(-10,70), ylim=c(-160,160), las=1,
      xaxp=c(-10,70,16), yaxp=c(-160,160,16),
      cex.axis=0.7 )
# Add some light horizontal and vertical lines.
abline( h=seq(-160,160,20), v=seq(-10,70,5),
        col="darkgray", lty="dotted")
# Add the axes.
abline( h=0, v=0, col="black" )
# Find the correlation coefficient.
cc <- cor( L1, L2 )
cc
# Look at the square of the correlation coefficient.
cc^2   # this time R did not have to recompute
# Not such a good value.
# Find the best line.
best_line <- lm( L2 ~ L1 )
best_line
# Plot the best line.
abline( best_line )
# Use the full equation to predict L2 when L1 is 20.3.
coeffs <- coefficients( best_line )
coeffs
coeffs[1] + coeffs[2]*20.3
# Just to be sure, plot that point.
points( 20.3, 43.17139, col="red", pch=17 )
# Look at the distribution of the residuals.
plot( L1, residuals( best_line ) )
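#
# One last, optional touch: adding a horizontal line at zero to
# a residual plot makes it easier to judge whether the residuals
# scatter evenly above and below the line of best fit.
abline( h=0, col="red", lty="dashed" )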